Note: Grading is based both on your graphs and verbal explanations. Follow all best practices as discussed in class, including choosing appropriate parameters for all graphs. Do not expect the assignment questions to spell out precisely how the graphs should be drawn. Sometimes guidance will be provided, but the absense of guidance does not mean that all choices are ok.
Read Graphical Data Analysis with R, Ch. 4, 5
[5 points]
Data: ames in the openintro package
library(openintro)
library(tidyverse)
library(scales)
library(dplyr)
library(plotly)
data(ames)
c<-ggplot(ames, aes(x=reorder(Roof.Style, Roof.Style, function(x)-length(x)))) + geom_bar(fill = "lightblue")+ theme_classic()+
labs(x='Roof Styles', y ='Count') +
ggtitle("Count of each roof style") +
theme(plot.title = element_text(hjust = 0.5))
c
data(ames)
ames$Mo.Sold <- as.Date(paste0("2018-", ames$Mo.Sold, "-1"))
c<-ggplot(ames, aes(x=Mo.Sold)) + geom_bar(fill = "lightblue")+ theme_classic()+
labs(x='Months', y ='Count') +
ggtitle("Count of each Month") +
theme(plot.title = element_text(hjust = 0.5))+
scale_x_date(labels = date_format("%b"))
c
"Ex" "Fa" "Gd" "Po" "TA" as levels.data(ames)
ls <- c()
vals<-c("Ex","Fa","Gd","Po","TA")
for(i in 1:ncol(ames)){
if (all(vals %in% sapply(ames[,i],levels))){
ls<-c(ls,i)
}
}
colnames(ames[,ls])
## [1] "Exter.Cond" "Bsmt.Qual" "Bsmt.Cond" "Heating.QC" "Kitchen.Qual"
## [6] "Fireplace.Qu" "Garage.Qual" "Garage.Cond"
facet_wrap() to display the frequency distribution of all variables from part c). (Hint: transform the data first with pivot_longer())library(data.table)
data(ames)
df<-ames
colnames=c('PID',"Exter.Cond","Bsmt.Qual","Bsmt.Cond","Heating.QC","Kitchen.Qual","Fireplace.Qu","Garage.Qual","Garage.Cond")
df<-df[colnames]
df%>%pivot_longer(cols=!PID,names_to='Type',values_to = "num")%>%na.omit()->df
df<-df[df$num != "", ]
ggplot(df, aes(x=reorder(num, num, function(x)-length(x))))+
geom_bar(color='black')+
facet_wrap(~Type, nrow=2)+
ggtitle("Frequency distribution of select variables") +
theme(plot.title = element_text(hjust = 0.5))+
labs(x='Levels', y ='Count')
[12 points]
Data: seattlepets in the openintro package
data("seattlepets")
df<-seattlepets
df<-df[!is.na(df$animal_name), ]
df %>% filter(species == 'Dog')%>%group_by(animal_name) %>%tally(sort=TRUE)%>%slice(1:30) -> df_dog
df %>%filter(species == 'Cat')%>%group_by(animal_name) %>%tally(sort=TRUE)%>%slice(1:30) -> df_cat
ggplot(df_dog, aes(x = n, y=reorder(animal_name,n))) +
geom_point()+ggtitle("Cleveland plot for Dogs") +
theme(plot.title = element_text(hjust = 0.5))+
labs(x='Count', y ='Names')+theme_linedraw()
ggplot(df_cat, aes(x = n, y=reorder(animal_name,n))) +
geom_point()+ggtitle("Cleveland plot for Cats") +
labs(x='Count', y ='Names')+theme_linedraw()
df2<-df[!(df$species=="Pig" | df$species=="Goat"),]
df2<-df2[!is.na(df2$animal_name), ]
df2<-df2[,c('species','animal_name')]
df2%>%group_by(animal_name,species)%>%tally()%>%filter(n()>1)%>%mutate(freq = n / sum(n))%>%arrange(desc(freq))%>%filter(species == 'Dog')->df3
df3=df3[0:30,]
ggplot(df3, aes(x = freq, y = reorder(animal_name, freq))) +
geom_point()+ggtitle("Cleveland plot for most DOG names") +
theme(plot.title = element_text(hjust = 0.5))+
labs(x='Count', y ='Names')+theme_linedraw()
df_total=df[!is.na(df$animal_name), ]
df_total%>%group_by(animal_name)%>%tally()%>%arrange(desc(n))%>%slice(1:30)->df_total
df %>% filter(species == 'Dog')%>%group_by(animal_name) %>%tally(sort=TRUE)->df_dog
df %>% filter(species == 'Cat')%>%group_by(animal_name) %>%tally(sort=TRUE)->df_cat
merge(
x=df_dog,
y=df_total,
by.x='animal_name',
by.y='animal_name'
)->df_total
merge(
x=df_cat,
y=df_total,
by.x='animal_name',
by.y='animal_name'
)->df_total
ggplot(df_total)+geom_point(aes(x=n.y,y=reorder(animal_name,n.y)),color='red')+geom_point(aes(x=n.x,y=animal_name),color='blue')+geom_point(aes(x=n,y=animal_name),color='green')+ggtitle("Cleveland plot for Popular names") +
labs(x='Count', y ='Names')+theme_linedraw()
merge(
x=df_cat,
y=df_dog,
by.x='animal_name',
by.y='animal_name'
)->df_pop
ggplot(df_pop)+geom_point(aes(x=n.y,y=n.x),alpha=0.5,colour="black")+geom_abline(intercept=0, slope=1)+ggtitle("Scatterplot for Cat names vs Dog names") +
labs(x='Number of Dogs with the name', y ='Number of Cats with the name')+theme_linedraw()
df_pop$frac<-df_pop$n.x/df_pop$n.y
ggplot(df_pop)+geom_point(aes(x=n.y,y=n.x, text=animal_name),col = ifelse(df_pop$frac < 0.90, 'red',ifelse(df_pop$frac>1.10,'green','blue')))+geom_abline(intercept=0, slope=1)+ggtitle("Scatterplot for Cat names vs Dog names") +
theme(plot.title = element_text(hjust = 0.5))+
labs(x='Number of Dogs with the name', y ='Number of Cats with the name')->x
ggplotly(x,tooltip='text') %>% config(displayModeBar = F)
The convention for ‘neutral’, ‘cat’, ‘dog’ points are colored blue,green and red respectively. We used an interactive plot so each point can be examined better. This increases visibility in dense areas as well. On hovering cursor over the point you can see the animal name and the color of its class.
[6 points]
Data: ames in the openintro package
For all, adjust parameters to the levels that provide the best views of the data.
Draw four plots of price vs. area with the following variations:
base <- ames %>% ggplot(aes(x=area, y=price/1000))
alpha.scatter <- base +
geom_point(size=1.5, alpha=0.4, colour="darkblue") +
xlab('Living Area in square feet') +
ylab('Price (in thousand dollars)') +
theme_bw()
scatter
scatter +
geom_density2d(size=0.6, colour="lightblue", bins=8)+
xlab('Living Area in square feet') +
ylab('Price (in thousand dollars)') +
ggtitle('Scatterplot of price vs area with a focus on density contour lines') +
theme_bw()
base +
geom_hex(bins=15) +
scale_fill_gradient(low = "skyblue", high = "darkblue") +
xlab('Living Area in square feet') +
ylab('Price (in thousand dollars)')+
ggtitle('Hexagonal heatmap of price vs area') +
theme_bw()
base+geom_bin2d(bins=15) +
scale_fill_gradient(low = "skyblue", high = "darkblue") +
xlab('Living Area in square feet') +
ylab('Price (in thousand dollars)')+
ggtitle('Square heatmap of price vs area') +
theme_bw()
[7 points]
Data: ames in the openintro package
price vs. area) this time faceting on Neighborhood (use facet_wrap(). Add best fitting lines and sort the facets by the slope of the best fitting line from low to high. (Use lm() to get the slopes.)ames_lm <- ames %>%
group_by(Neighborhood) %>%
group_modify(~ broom::tidy(lm(price ~ area, data=.x))) %>%
filter(term == "area")
ames$slope <- ames_lm$estimate[match(ames$Neighborhood,ames_lm$Neighborhood)]
ames %>% ggplot(aes(x=area/1000, y=price/1000)) +
geom_point(size=2, alpha=0.4) +
xlab('Living Area per 1000 square feet') +
ylab('Price (in thousand dollars)') +
facet_wrap(.~reorder(Neighborhood, slope), nrow = 4) +
geom_smooth(method = 'lm', formula = y~x)+
ggtitle('Price vs Area Scatterplot facetted by Neighbourhood and ordered by slope')
ames_mean <- ames %>%
group_by(Neighborhood) %>%
summarise_at(vars(price), list(mean_price = mean))
ames_mean$slope <- ames_lm$estimate[match(ames_mean$Neighborhood,ames_lm$Neighborhood)]
ames_mean %>%
ggplot(aes(x=slope, y=mean_price)) +
geom_point() +
ggtitle('Mean price vs Slope Scatterplot with a data point representing a Neighbourhood') +
theme_bw()
price on area by Neighborhood. Is the \(R^2\) higher in neighborhoods with higher mean housing prices? Are the results the same for slope and \(R^2\)? Explain using examples from the graphs.r_squared_calc <- function(dat_in, area='area', price='price'){
return(summary(lm(price ~ area, data=dat_in))$r.squared)
}
ames <- ames %>%
group_by(Neighborhood) %>%
do(data.frame(area = .$area,
price = .$price,
r_sqrd = r_squared_calc(.)))
ames %>% ggplot(aes(x=area/1000, y=price/1000)) +
geom_point(size=2, alpha=0.4) +
xlab('Living Area per 1000 square feet') +
ylab('Price (in thousand dollars)') +
facet_wrap(.~reorder(Neighborhood, r_sqrd), nrow = 4) +
geom_smooth(method = 'lm', formula = y~x)
ames_mean$r_sqrd <- ames$r_sqrd[match(ames_mean$Neighborhood,ames$Neighborhood)]
ames_mean %>%
ggplot(aes(x=r_sqrd, y=mean_price)) +
geom_point() +
ggtitle('Price vs Area Scatterplot facetted by Neighbourhood and ordered by R^2') +
theme_bw()